PROJECT SUMMARY:

With the chosen dataset, I aim to find out if a bank customer’s loan would be approved or not. For this purpose, I will use classification algorithms (K-NN & Naive Bayes) and compare the results from both analyses and make an interpretation as to which one is accurate and suitable for this dataset.


DETAILS:

  1. Number of rows - The dataset has 614 values of Rows.
  2. How many columns - The dataset has 13 columns.
  3. Target Variable - The target variable is Loan Status
  4. How many categorical variables and how many numerical ones are among your features. - There are 7 categorical and 6 numerical variables in the dataset.

# LOADING THE DATASET:
Training_dataset <- read.csv("/Users/ajithrajperiyasamy/Desktop/FILES/KSU FILES/CAPSTONE/BANK LOAN/CODING/Training dataset.csv")
Testing_dataset <- read.csv("/Users/ajithrajperiyasamy/Desktop/FILES/KSU FILES/CAPSTONE/BANK LOAN/CODING/Testing dataset.csv")
head(Training_dataset) #Displays the first 6 values of each columns 
##    Loan_ID Gender Married Dependents    Education Self_Employed ApplicantIncome
## 1 LP001002   Male      No          0     Graduate            No            5849
## 2 LP001003   Male     Yes          1     Graduate            No            4583
## 3 LP001005   Male     Yes          0     Graduate           Yes            3000
## 4 LP001006   Male     Yes          0 Not Graduate            No            2583
## 5 LP001008   Male      No          0     Graduate            No            6000
## 6 LP001011   Male     Yes          2     Graduate           Yes            5417
##   CoapplicantIncome LoanAmount Loan_Amount_Term Credit_History Property_Area
## 1                 0         NA              360              1         Urban
## 2              1508        128              360              1         Rural
## 3                 0         66              360              1         Urban
## 4              2358        120              360              1         Urban
## 5                 0        141              360              1         Urban
## 6              4196        267              360              1         Urban
##   Loan_Status
## 1           Y
## 2           N
## 3           Y
## 4           Y
## 5           Y
## 6           Y
str(Training_dataset) #Displays the structure of the dataset.
## 'data.frame':    614 obs. of  13 variables:
##  $ Loan_ID          : chr  "LP001002" "LP001003" "LP001005" "LP001006" ...
##  $ Gender           : chr  "Male" "Male" "Male" "Male" ...
##  $ Married          : chr  "No" "Yes" "Yes" "Yes" ...
##  $ Dependents       : chr  "0" "1" "0" "0" ...
##  $ Education        : chr  "Graduate" "Graduate" "Graduate" "Not Graduate" ...
##  $ Self_Employed    : chr  "No" "No" "Yes" "No" ...
##  $ ApplicantIncome  : int  5849 4583 3000 2583 6000 5417 2333 3036 4006 12841 ...
##  $ CoapplicantIncome: num  0 1508 0 2358 0 ...
##  $ LoanAmount       : int  NA 128 66 120 141 267 95 158 168 349 ...
##  $ Loan_Amount_Term : int  360 360 360 360 360 360 360 360 360 360 ...
##  $ Credit_History   : int  1 1 1 1 1 1 1 0 1 1 ...
##  $ Property_Area    : chr  "Urban" "Rural" "Urban" "Urban" ...
##  $ Loan_Status      : chr  "Y" "N" "Y" "Y" ...

# Missing Value:
total_missing <- sum(is.na(Training_dataset))
total_missing
## [1] 86
total_cells <- nrow(Training_dataset)*ncol(Training_dataset)
total_cells
## [1] 7982
percent_missing <- (total_missing/total_cells)*100
percent_missing
## [1] 1.077424
print(paste("Percentage of missing values in the dataset:",percent_missing,"%")) # Since the percentage of missing values is just 1%, we dont need to impute the missing values, rather we can omit them, because they would not affect our results.
## [1] "Percentage of missing values in the dataset: 1.07742420446004 %"
training_dataset <- na.omit(Training_dataset)

Performing Descreptive statistics to better understand the dataset.
# DESCRIPTIVE AND BASIC STATISTICS:
# 1.Summary Statistics:
summary(Training_dataset) # Summary gives us an idea about the mean, median, maximum and minimum value of all the variables belonging to the dataset.
##    Loan_ID             Gender            Married           Dependents       
##  Length:614         Length:614         Length:614         Length:614        
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##   Education         Self_Employed      ApplicantIncome CoapplicantIncome
##  Length:614         Length:614         Min.   :  150   Min.   :    0    
##  Class :character   Class :character   1st Qu.: 2878   1st Qu.:    0    
##  Mode  :character   Mode  :character   Median : 3812   Median : 1188    
##                                        Mean   : 5403   Mean   : 1621    
##                                        3rd Qu.: 5795   3rd Qu.: 2297    
##                                        Max.   :81000   Max.   :41667    
##                                                                         
##    LoanAmount    Loan_Amount_Term Credit_History   Property_Area     
##  Min.   :  9.0   Min.   : 12      Min.   :0.0000   Length:614        
##  1st Qu.:100.0   1st Qu.:360      1st Qu.:1.0000   Class :character  
##  Median :128.0   Median :360      Median :1.0000   Mode  :character  
##  Mean   :146.4   Mean   :342      Mean   :0.8422                     
##  3rd Qu.:168.0   3rd Qu.:360      3rd Qu.:1.0000                     
##  Max.   :700.0   Max.   :480      Max.   :1.0000                     
##  NA's   :22      NA's   :14       NA's   :50                         
##  Loan_Status       
##  Length:614        
##  Class :character  
##  Mode  :character  
##                    
##                    
##                    
## 

# 2. Scatter plot:
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Identify numeric columns
numeric_vars <- Training_dataset %>%
  select_if(is.numeric) %>%
  names()
numeric_vars
## [1] "ApplicantIncome"   "CoapplicantIncome" "LoanAmount"       
## [4] "Loan_Amount_Term"  "Credit_History"
library(ggplot2)

# Pairwise scatter plots for numeric variables
for (i in 1:(length(numeric_vars)-1)) {
  for (j in (i+1):length(numeric_vars)) {
    p <- ggplot(Training_dataset, aes_string(x = numeric_vars[i], y = numeric_vars[j], color = "Loan_Status")) +
      geom_point(alpha = 0.6) +
      labs(title = paste(numeric_vars[i], "vs", numeric_vars[j]),
           x = numeric_vars[i], y = numeric_vars[j]) +
      theme_minimal()
    print(p)
  }
}
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## Warning: Removed 22 rows containing missing values or values outside the scale range
## (`geom_point()`).

## Warning: Removed 14 rows containing missing values or values outside the scale range
## (`geom_point()`).

## Warning: Removed 50 rows containing missing values or values outside the scale range
## (`geom_point()`).

## Warning: Removed 22 rows containing missing values or values outside the scale range
## (`geom_point()`).

## Warning: Removed 14 rows containing missing values or values outside the scale range
## (`geom_point()`).

## Warning: Removed 50 rows containing missing values or values outside the scale range
## (`geom_point()`).

## Warning: Removed 36 rows containing missing values or values outside the scale range
## (`geom_point()`).

## Warning: Removed 71 rows containing missing values or values outside the scale range
## (`geom_point()`).

## Warning: Removed 64 rows containing missing values or values outside the scale range
## (`geom_point()`).


# 3. BARPLOT:
barplot <- barplot(table(Training_dataset$Loan_Status), 
        main = "Loan Status",
        xlab = "Decision",
        ylab = "Frequency")


# 4. BOXPLOT:
# Box plot for Applicant Income vs Loan Status
ggplot(training_dataset, aes(x = Loan_Status, y = ApplicantIncome)) +
  geom_boxplot() +
  labs(title = "Box Plot of Applicant Income by Loan Status", x = "Loan Status", y = "ApplicantIncome")

# Box plot for Co Applicant Income vs Loan Status
ggplot(training_dataset, aes(x = Loan_Status, y = CoapplicantIncome)) +
  geom_boxplot() +
  labs(title = "Box Plot of Coapplicant Income by Loan Status", x = "Loan Status", y = "CoapplicantIncome")

# Box plot for Loan Amount vs Loan Status
ggplot(training_dataset, aes(x = Loan_Status, y = LoanAmount)) +
  geom_boxplot() +
  labs(title = "Box Plot of Loan Amount by Loan Status", x = "Loan Status", y = "LoanAmount")


# 5. HISTOGRAM:
library(ggplot2)

# Age distribution
ggplot(training_dataset, aes(x = ApplicantIncome)) + 
  geom_histogram(binwidth = 1000, fill="lightblue", color="black") + 
  labs(title="Applicant Income Distribution", x="ApplicantIncome", y="Count")

# Income distribution
ggplot(training_dataset, aes(x = CoapplicantIncome)) + 
  geom_histogram(binwidth = 1000, fill="lightgreen", color="black") + 
  labs(title="Co-applicant Income Distribution", x="CoapplicantIncome", y="Count")

# Loan amount distribution
ggplot(training_dataset, aes(x = LoanAmount)) + 
  geom_histogram(binwidth = 500, fill="lightcoral", color="black") + 
  labs(title="Loan Amount Distribution", x="LoanAmount", y="Count")

# Credit History distribution
ggplot(training_dataset, aes(x = Credit_History)) + 
  geom_histogram(binwidth = 10, fill="lightgoldenrod", color="black") + 
  labs(title="Credit History Distribution", x="Credit History", y="Count")


In this step, we aim to convert all categorical variable columns into dummy varibales using ‘One-Hot-Encoding’
# CONVERTING CATEGORICAL VARIABLES TO NUMERIC BY ONE-HOT ENCODING:
library(caret)
## Loading required package: lattice
head(training_dataset)
##    Loan_ID Gender Married Dependents    Education Self_Employed ApplicantIncome
## 2 LP001003   Male     Yes          1     Graduate            No            4583
## 3 LP001005   Male     Yes          0     Graduate           Yes            3000
## 4 LP001006   Male     Yes          0 Not Graduate            No            2583
## 5 LP001008   Male      No          0     Graduate            No            6000
## 6 LP001011   Male     Yes          2     Graduate           Yes            5417
## 7 LP001013   Male     Yes          0 Not Graduate            No            2333
##   CoapplicantIncome LoanAmount Loan_Amount_Term Credit_History Property_Area
## 2              1508        128              360              1         Rural
## 3                 0         66              360              1         Urban
## 4              2358        120              360              1         Urban
## 5                 0        141              360              1         Urban
## 6              4196        267              360              1         Urban
## 7              1516         95              360              1         Urban
##   Loan_Status
## 2           N
## 3           Y
## 4           Y
## 5           Y
## 6           Y
## 7           Y
dummy_gender <- dummyVars(~Gender, data=training_dataset)
dummy_Married <- dummyVars(~Married, data=training_dataset)
dummy_Education <- dummyVars(~Education, data=training_dataset)
dummy_Self_Employed <- dummyVars(~Self_Employed, data=training_dataset)
dummy_Property_Area <- dummyVars(~Property_Area, data=training_dataset)
dummy_Loan_Status<- dummyVars(~Loan_Status, data=training_dataset)
dummy_Dependents<- dummyVars(~Dependents, data=training_dataset)
encoded_training_dataset <- cbind(training_dataset,
                                  predict(dummy_gender,training_dataset),
                                  predict(dummy_Married,training_dataset),
                                  predict(dummy_Education,training_dataset),
                                  predict(dummy_Self_Employed,training_dataset),
                                  predict(dummy_Property_Area,training_dataset),
                                  predict(dummy_Loan_Status,training_dataset),
                                  predict(dummy_Dependents,training_dataset))
head(encoded_training_dataset)
##    Loan_ID Gender Married Dependents    Education Self_Employed ApplicantIncome
## 2 LP001003   Male     Yes          1     Graduate            No            4583
## 3 LP001005   Male     Yes          0     Graduate           Yes            3000
## 4 LP001006   Male     Yes          0 Not Graduate            No            2583
## 5 LP001008   Male      No          0     Graduate            No            6000
## 6 LP001011   Male     Yes          2     Graduate           Yes            5417
## 7 LP001013   Male     Yes          0 Not Graduate            No            2333
##   CoapplicantIncome LoanAmount Loan_Amount_Term Credit_History Property_Area
## 2              1508        128              360              1         Rural
## 3                 0         66              360              1         Urban
## 4              2358        120              360              1         Urban
## 5                 0        141              360              1         Urban
## 6              4196        267              360              1         Urban
## 7              1516         95              360              1         Urban
##   Loan_Status Gender GenderFemale GenderMale Married MarriedNo MarriedYes
## 2           N      0            0          1       0         0          1
## 3           Y      0            0          1       0         0          1
## 4           Y      0            0          1       0         0          1
## 5           Y      0            0          1       0         1          0
## 6           Y      0            0          1       0         0          1
## 7           Y      0            0          1       0         0          1
##   EducationGraduate EducationNot Graduate Self_Employed Self_EmployedNo
## 2                 1                     0             0               1
## 3                 1                     0             0               0
## 4                 0                     1             0               1
## 5                 1                     0             0               1
## 6                 1                     0             0               0
## 7                 0                     1             0               1
##   Self_EmployedYes Property_AreaRural Property_AreaSemiurban Property_AreaUrban
## 2                0                  1                      0                  0
## 3                1                  0                      0                  1
## 4                0                  0                      0                  1
## 5                0                  0                      0                  1
## 6                1                  0                      0                  1
## 7                0                  0                      0                  1
##   Loan_StatusN Loan_StatusY Dependents Dependents0 Dependents1 Dependents2
## 2            1            0          0           0           1           0
## 3            0            1          0           1           0           0
## 4            0            1          0           1           0           0
## 5            0            1          0           1           0           0
## 6            0            1          0           0           0           1
## 7            0            1          0           1           0           0
##   Dependents3+
## 2            0
## 3            0
## 4            0
## 5            0
## 6            0
## 7            0

remove the converted columns
encoded_training_dataset <- encoded_training_dataset[, -c(1,2,3,4,5,6,12,13)]
head(encoded_training_dataset)
##   ApplicantIncome CoapplicantIncome LoanAmount Loan_Amount_Term Credit_History
## 2            4583              1508        128              360              1
## 3            3000                 0         66              360              1
## 4            2583              2358        120              360              1
## 5            6000                 0        141              360              1
## 6            5417              4196        267              360              1
## 7            2333              1516         95              360              1
##   Gender GenderFemale GenderMale Married MarriedNo MarriedYes EducationGraduate
## 2      0            0          1       0         0          1                 1
## 3      0            0          1       0         0          1                 1
## 4      0            0          1       0         0          1                 0
## 5      0            0          1       0         1          0                 1
## 6      0            0          1       0         0          1                 1
## 7      0            0          1       0         0          1                 0
##   EducationNot Graduate Self_Employed Self_EmployedNo Self_EmployedYes
## 2                     0             0               1                0
## 3                     0             0               0                1
## 4                     1             0               1                0
## 5                     0             0               1                0
## 6                     0             0               0                1
## 7                     1             0               1                0
##   Property_AreaRural Property_AreaSemiurban Property_AreaUrban Loan_StatusN
## 2                  1                      0                  0            1
## 3                  0                      0                  1            0
## 4                  0                      0                  1            0
## 5                  0                      0                  1            0
## 6                  0                      0                  1            0
## 7                  0                      0                  1            0
##   Loan_StatusY Dependents Dependents0 Dependents1 Dependents2 Dependents3+
## 2            0          0           0           1           0            0
## 3            1          0           1           0           0            0
## 4            1          0           1           0           0            0
## 5            1          0           1           0           0            0
## 6            1          0           0           0           1            0
## 7            1          0           1           0           0            0

Normalizing the entire dataset using preProcess function to bring all the variables to a common scale, for unbiased analysis.
# NORMALIZING THE DATA:
summary(encoded_training_dataset)
##  ApplicantIncome CoapplicantIncome   LoanAmount    Loan_Amount_Term
##  Min.   :  150   Min.   :    0     Min.   :  9.0   Min.   : 36.0   
##  1st Qu.: 2900   1st Qu.:    0     1st Qu.:100.0   1st Qu.:360.0   
##  Median : 3816   Median : 1086     Median :128.0   Median :360.0   
##  Mean   : 5508   Mean   : 1542     Mean   :145.9   Mean   :342.4   
##  3rd Qu.: 5815   3rd Qu.: 2232     3rd Qu.:167.0   3rd Qu.:360.0   
##  Max.   :81000   Max.   :33837     Max.   :700.0   Max.   :480.0   
##  Credit_History       Gender         GenderFemale      GenderMale    
##  Min.   :0.0000   Min.   :0.00000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:1.0000   1st Qu.:0.00000   1st Qu.:0.0000   1st Qu.:1.0000  
##  Median :1.0000   Median :0.00000   Median :0.0000   Median :1.0000  
##  Mean   :0.8507   Mean   :0.02268   Mean   :0.1796   Mean   :0.7977  
##  3rd Qu.:1.0000   3rd Qu.:0.00000   3rd Qu.:0.0000   3rd Qu.:1.0000  
##  Max.   :1.0000   Max.   :1.00000   Max.   :1.0000   Max.   :1.0000  
##     Married           MarriedNo        MarriedYes     EducationGraduate
##  Min.   :0.000000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   
##  1st Qu.:0.000000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:1.0000   
##  Median :0.000000   Median :0.0000   Median :1.0000   Median :1.0000   
##  Mean   :0.003781   Mean   :0.3554   Mean   :0.6408   Mean   :0.7958   
##  3rd Qu.:0.000000   3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:1.0000   
##  Max.   :1.000000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   
##  EducationNot Graduate Self_Employed     Self_EmployedNo  Self_EmployedYes
##  Min.   :0.0000        Min.   :0.00000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000        1st Qu.:0.00000   1st Qu.:1.0000   1st Qu.:0.0000  
##  Median :0.0000        Median :0.00000   Median :1.0000   Median :0.0000  
##  Mean   :0.2042        Mean   :0.04726   Mean   :0.8204   Mean   :0.1323  
##  3rd Qu.:0.0000        3rd Qu.:0.00000   3rd Qu.:1.0000   3rd Qu.:0.0000  
##  Max.   :1.0000        Max.   :1.00000   Max.   :1.0000   Max.   :1.0000  
##  Property_AreaRural Property_AreaSemiurban Property_AreaUrban  Loan_StatusN   
##  Min.   :0.000      Min.   :0.0000         Min.   :0.0000     Min.   :0.0000  
##  1st Qu.:0.000      1st Qu.:0.0000         1st Qu.:0.0000     1st Qu.:0.0000  
##  Median :0.000      Median :0.0000         Median :0.0000     Median :0.0000  
##  Mean   :0.293      Mean   :0.3951         Mean   :0.3119     Mean   :0.3081  
##  3rd Qu.:1.000      3rd Qu.:1.0000         3rd Qu.:1.0000     3rd Qu.:1.0000  
##  Max.   :1.000      Max.   :1.0000         Max.   :1.0000     Max.   :1.0000  
##   Loan_StatusY      Dependents       Dependents0      Dependents1    
##  Min.   :0.0000   Min.   :0.00000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.00000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :1.0000   Median :0.00000   Median :1.0000   Median :0.0000  
##  Mean   :0.6919   Mean   :0.02268   Mean   :0.5577   Mean   :0.1607  
##  3rd Qu.:1.0000   3rd Qu.:0.00000   3rd Qu.:1.0000   3rd Qu.:0.0000  
##  Max.   :1.0000   Max.   :1.00000   Max.   :1.0000   Max.   :1.0000  
##   Dependents2      Dependents3+    
##  Min.   :0.0000   Min.   :0.00000  
##  1st Qu.:0.0000   1st Qu.:0.00000  
##  Median :0.0000   Median :0.00000  
##  Mean   :0.1739   Mean   :0.08507  
##  3rd Qu.:0.0000   3rd Qu.:0.00000  
##  Max.   :1.0000   Max.   :1.00000
encoded_training_dataset_norm <- preProcess(encoded_training_dataset,method = c('range'))
normalized_training_dataset <- predict(encoded_training_dataset_norm,encoded_training_dataset)
head(normalized_training_dataset)
##   ApplicantIncome CoapplicantIncome LoanAmount Loan_Amount_Term Credit_History
## 2      0.05482993        0.04456660 0.17221418        0.7297297              1
## 3      0.03525046        0.00000000 0.08248915        0.7297297              1
## 4      0.03009276        0.06968703 0.16063676        0.7297297              1
## 5      0.07235622        0.00000000 0.19102750        0.7297297              1
## 6      0.06514533        0.12400627 0.37337192        0.7297297              1
## 7      0.02700062        0.04480303 0.12445731        0.7297297              1
##   Gender GenderFemale GenderMale Married MarriedNo MarriedYes EducationGraduate
## 2      0            0          1       0         0          1                 1
## 3      0            0          1       0         0          1                 1
## 4      0            0          1       0         0          1                 0
## 5      0            0          1       0         1          0                 1
## 6      0            0          1       0         0          1                 1
## 7      0            0          1       0         0          1                 0
##   EducationNot Graduate Self_Employed Self_EmployedNo Self_EmployedYes
## 2                     0             0               1                0
## 3                     0             0               0                1
## 4                     1             0               1                0
## 5                     0             0               1                0
## 6                     0             0               0                1
## 7                     1             0               1                0
##   Property_AreaRural Property_AreaSemiurban Property_AreaUrban Loan_StatusN
## 2                  1                      0                  0            1
## 3                  0                      0                  1            0
## 4                  0                      0                  1            0
## 5                  0                      0                  1            0
## 6                  0                      0                  1            0
## 7                  0                      0                  1            0
##   Loan_StatusY Dependents Dependents0 Dependents1 Dependents2 Dependents3+
## 2            0          0           0           1           0            0
## 3            1          0           1           0           0            0
## 4            1          0           1           0           0            0
## 5            1          0           1           0           0            0
## 6            1          0           0           0           1            0
## 7            1          0           1           0           0            0
summary(normalized_training_dataset)
##  ApplicantIncome   CoapplicantIncome   LoanAmount     Loan_Amount_Term
##  Min.   :0.00000   Min.   :0.00000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.03401   1st Qu.:0.00000   1st Qu.:0.1317   1st Qu.:0.7297  
##  Median :0.04534   Median :0.03209   Median :0.1722   Median :0.7297  
##  Mean   :0.06627   Mean   :0.04558   Mean   :0.1981   Mean   :0.6900  
##  3rd Qu.:0.07007   3rd Qu.:0.06596   3rd Qu.:0.2287   3rd Qu.:0.7297  
##  Max.   :1.00000   Max.   :1.00000   Max.   :1.0000   Max.   :1.0000  
##  Credit_History       Gender         GenderFemale      GenderMale    
##  Min.   :0.0000   Min.   :0.00000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:1.0000   1st Qu.:0.00000   1st Qu.:0.0000   1st Qu.:1.0000  
##  Median :1.0000   Median :0.00000   Median :0.0000   Median :1.0000  
##  Mean   :0.8507   Mean   :0.02268   Mean   :0.1796   Mean   :0.7977  
##  3rd Qu.:1.0000   3rd Qu.:0.00000   3rd Qu.:0.0000   3rd Qu.:1.0000  
##  Max.   :1.0000   Max.   :1.00000   Max.   :1.0000   Max.   :1.0000  
##     Married           MarriedNo        MarriedYes     EducationGraduate
##  Min.   :0.000000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   
##  1st Qu.:0.000000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:1.0000   
##  Median :0.000000   Median :0.0000   Median :1.0000   Median :1.0000   
##  Mean   :0.003781   Mean   :0.3554   Mean   :0.6408   Mean   :0.7958   
##  3rd Qu.:0.000000   3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:1.0000   
##  Max.   :1.000000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   
##  EducationNot Graduate Self_Employed     Self_EmployedNo  Self_EmployedYes
##  Min.   :0.0000        Min.   :0.00000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000        1st Qu.:0.00000   1st Qu.:1.0000   1st Qu.:0.0000  
##  Median :0.0000        Median :0.00000   Median :1.0000   Median :0.0000  
##  Mean   :0.2042        Mean   :0.04726   Mean   :0.8204   Mean   :0.1323  
##  3rd Qu.:0.0000        3rd Qu.:0.00000   3rd Qu.:1.0000   3rd Qu.:0.0000  
##  Max.   :1.0000        Max.   :1.00000   Max.   :1.0000   Max.   :1.0000  
##  Property_AreaRural Property_AreaSemiurban Property_AreaUrban  Loan_StatusN   
##  Min.   :0.000      Min.   :0.0000         Min.   :0.0000     Min.   :0.0000  
##  1st Qu.:0.000      1st Qu.:0.0000         1st Qu.:0.0000     1st Qu.:0.0000  
##  Median :0.000      Median :0.0000         Median :0.0000     Median :0.0000  
##  Mean   :0.293      Mean   :0.3951         Mean   :0.3119     Mean   :0.3081  
##  3rd Qu.:1.000      3rd Qu.:1.0000         3rd Qu.:1.0000     3rd Qu.:1.0000  
##  Max.   :1.000      Max.   :1.0000         Max.   :1.0000     Max.   :1.0000  
##   Loan_StatusY      Dependents       Dependents0      Dependents1    
##  Min.   :0.0000   Min.   :0.00000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.00000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :1.0000   Median :0.00000   Median :1.0000   Median :0.0000  
##  Mean   :0.6919   Mean   :0.02268   Mean   :0.5577   Mean   :0.1607  
##  3rd Qu.:1.0000   3rd Qu.:0.00000   3rd Qu.:1.0000   3rd Qu.:0.0000  
##  Max.   :1.0000   Max.   :1.00000   Max.   :1.0000   Max.   :1.0000  
##   Dependents2      Dependents3+    
##  Min.   :0.0000   Min.   :0.00000  
##  1st Qu.:0.0000   1st Qu.:0.00000  
##  Median :0.0000   Median :0.00000  
##  Mean   :0.1739   Mean   :0.08507  
##  3rd Qu.:0.0000   3rd Qu.:0.00000  
##  Max.   :1.0000   Max.   :1.00000

Now, we are moving to the variable selection process. The idea is to use Corrplot, Backward Stepwise Regression, and PCA to perform variable selection.
# FEATURE SELECTION (VARIABLE SELECTION)
# 1. Corrplot:
library(corrplot)
## corrplot 0.92 loaded
cor_matrix <- cor(normalized_training_dataset[,sapply(normalized_training_dataset,is.numeric)],use = "complete.obs")
corrplot(cor_matrix, method = "circle",tl.cex=0.7)


Performed PCA for all variables to filter out the important variables for our analysis.
# 2. PCA FOR ALL VARIABLES:
library(FactoMineR)
PCA(normalized_training_dataset)

## Warning: ggrepel: 12 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

## **Results for the Principal Component Analysis (PCA)**
## The analysis was performed on 529 individuals, described by 26 variables
## *The results are available in the following objects:
## 
##    name               description                          
## 1  "$eig"             "eigenvalues"                        
## 2  "$var"             "results for the variables"          
## 3  "$var$coord"       "coord. for the variables"           
## 4  "$var$cor"         "correlations variables - dimensions"
## 5  "$var$cos2"        "cos2 for the variables"             
## 6  "$var$contrib"     "contributions of the variables"     
## 7  "$ind"             "results for the individuals"        
## 8  "$ind$coord"       "coord. for the individuals"         
## 9  "$ind$cos2"        "cos2 for the individuals"           
## 10 "$ind$contrib"     "contributions of the individuals"   
## 11 "$call"            "summary statistics"                 
## 12 "$call$centre"     "mean of the variables"              
## 13 "$call$ecart.type" "standard error of the variables"    
## 14 "$call$row.w"      "weights for the individuals"        
## 15 "$call$col.w"      "weights for the variables"

Combining Loan Status - Yes or No into one single column
normalized_training_dataset$Loan_Status<- ifelse(normalized_training_dataset$Loan_StatusN == 1, 0, 1) 
# Drop the original Loan Status columns:
normalized_training_dataset <-normalized_training_dataset[, -c(20,21)]
# Checking names of columns to ensure changes have been made:
colnames(normalized_training_dataset)
##  [1] "ApplicantIncome"        "CoapplicantIncome"      "LoanAmount"            
##  [4] "Loan_Amount_Term"       "Credit_History"         "Gender"                
##  [7] "GenderFemale"           "GenderMale"             "Married"               
## [10] "MarriedNo"              "MarriedYes"             "EducationGraduate"     
## [13] "EducationNot Graduate"  "Self_Employed"          "Self_EmployedNo"       
## [16] "Self_EmployedYes"       "Property_AreaRural"     "Property_AreaSemiurban"
## [19] "Property_AreaUrban"     "Dependents"             "Dependents0"           
## [22] "Dependents1"            "Dependents2"            "Dependents3+"          
## [25] "Loan_Status"
head(normalized_training_dataset)
##   ApplicantIncome CoapplicantIncome LoanAmount Loan_Amount_Term Credit_History
## 2      0.05482993        0.04456660 0.17221418        0.7297297              1
## 3      0.03525046        0.00000000 0.08248915        0.7297297              1
## 4      0.03009276        0.06968703 0.16063676        0.7297297              1
## 5      0.07235622        0.00000000 0.19102750        0.7297297              1
## 6      0.06514533        0.12400627 0.37337192        0.7297297              1
## 7      0.02700062        0.04480303 0.12445731        0.7297297              1
##   Gender GenderFemale GenderMale Married MarriedNo MarriedYes EducationGraduate
## 2      0            0          1       0         0          1                 1
## 3      0            0          1       0         0          1                 1
## 4      0            0          1       0         0          1                 0
## 5      0            0          1       0         1          0                 1
## 6      0            0          1       0         0          1                 1
## 7      0            0          1       0         0          1                 0
##   EducationNot Graduate Self_Employed Self_EmployedNo Self_EmployedYes
## 2                     0             0               1                0
## 3                     0             0               0                1
## 4                     1             0               1                0
## 5                     0             0               1                0
## 6                     0             0               0                1
## 7                     1             0               1                0
##   Property_AreaRural Property_AreaSemiurban Property_AreaUrban Dependents
## 2                  1                      0                  0          0
## 3                  0                      0                  1          0
## 4                  0                      0                  1          0
## 5                  0                      0                  1          0
## 6                  0                      0                  1          0
## 7                  0                      0                  1          0
##   Dependents0 Dependents1 Dependents2 Dependents3+ Loan_Status
## 2           0           1           0            0           0
## 3           1           0           0            0           1
## 4           1           0           0            0           1
## 5           1           0           0            0           1
## 6           0           0           1            0           1
## 7           1           0           0            0           1

Performed Backward Stepwise regression to identify the most significant input variables.
# 3. Stepwise Regression:
# Load necessary library
library(MASS) 
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
# Fit initial linear regression model with all predictors
initial_model <- lm(Loan_Status ~ ., data = normalized_training_dataset)

# Perform backward elimination for variable selection
final_model <- step(initial_model, direction = "backward")
## Start:  AIC=-1000.23
## Loan_Status ~ ApplicantIncome + CoapplicantIncome + LoanAmount + 
##     Loan_Amount_Term + Credit_History + Gender + GenderFemale + 
##     GenderMale + Married + MarriedNo + MarriedYes + EducationGraduate + 
##     `EducationNot Graduate` + Self_Employed + Self_EmployedNo + 
##     Self_EmployedYes + Property_AreaRural + Property_AreaSemiurban + 
##     Property_AreaUrban + Dependents + Dependents0 + Dependents1 + 
##     Dependents2 + `Dependents3+`
## 
## 
## Step:  AIC=-1000.23
## Loan_Status ~ ApplicantIncome + CoapplicantIncome + LoanAmount + 
##     Loan_Amount_Term + Credit_History + Gender + GenderFemale + 
##     GenderMale + Married + MarriedNo + MarriedYes + EducationGraduate + 
##     `EducationNot Graduate` + Self_Employed + Self_EmployedNo + 
##     Self_EmployedYes + Property_AreaRural + Property_AreaSemiurban + 
##     Property_AreaUrban + Dependents + Dependents0 + Dependents1 + 
##     Dependents2
## 
## 
## Step:  AIC=-1000.23
## Loan_Status ~ ApplicantIncome + CoapplicantIncome + LoanAmount + 
##     Loan_Amount_Term + Credit_History + Gender + GenderFemale + 
##     GenderMale + Married + MarriedNo + MarriedYes + EducationGraduate + 
##     `EducationNot Graduate` + Self_Employed + Self_EmployedNo + 
##     Self_EmployedYes + Property_AreaRural + Property_AreaSemiurban + 
##     Dependents + Dependents0 + Dependents1 + Dependents2
## 
## 
## Step:  AIC=-1000.23
## Loan_Status ~ ApplicantIncome + CoapplicantIncome + LoanAmount + 
##     Loan_Amount_Term + Credit_History + Gender + GenderFemale + 
##     GenderMale + Married + MarriedNo + MarriedYes + EducationGraduate + 
##     `EducationNot Graduate` + Self_Employed + Self_EmployedNo + 
##     Property_AreaRural + Property_AreaSemiurban + Dependents + 
##     Dependents0 + Dependents1 + Dependents2
## 
## 
## Step:  AIC=-1000.23
## Loan_Status ~ ApplicantIncome + CoapplicantIncome + LoanAmount + 
##     Loan_Amount_Term + Credit_History + Gender + GenderFemale + 
##     GenderMale + Married + MarriedNo + MarriedYes + EducationGraduate + 
##     Self_Employed + Self_EmployedNo + Property_AreaRural + Property_AreaSemiurban + 
##     Dependents + Dependents0 + Dependents1 + Dependents2
## 
## 
## Step:  AIC=-1000.23
## Loan_Status ~ ApplicantIncome + CoapplicantIncome + LoanAmount + 
##     Loan_Amount_Term + Credit_History + Gender + GenderFemale + 
##     GenderMale + Married + MarriedNo + EducationGraduate + Self_Employed + 
##     Self_EmployedNo + Property_AreaRural + Property_AreaSemiurban + 
##     Dependents + Dependents0 + Dependents1 + Dependents2
## 
## 
## Step:  AIC=-1000.23
## Loan_Status ~ ApplicantIncome + CoapplicantIncome + LoanAmount + 
##     Loan_Amount_Term + Credit_History + Gender + GenderFemale + 
##     Married + MarriedNo + EducationGraduate + Self_Employed + 
##     Self_EmployedNo + Property_AreaRural + Property_AreaSemiurban + 
##     Dependents + Dependents0 + Dependents1 + Dependents2
## 
##                          Df Sum of Sq     RSS      AIC
## - Dependents2             1    0.0036  74.323 -1002.20
## - Self_EmployedNo         1    0.0201  74.339 -1002.08
## - ApplicantIncome         1    0.0302  74.349 -1002.01
## - Gender                  1    0.0404  74.359 -1001.94
## - Dependents              1    0.0408  74.360 -1001.94
## - Married                 1    0.0502  74.369 -1001.87
## - Dependents0             1    0.0834  74.402 -1001.63
## - Loan_Amount_Term        1    0.1048  74.424 -1001.48
## - Property_AreaRural      1    0.1360  74.455 -1001.26
## - GenderFemale            1    0.1408  74.460 -1001.23
## - CoapplicantIncome       1    0.1746  74.494 -1000.98
## - LoanAmount              1    0.2009  74.520 -1000.80
## - Dependents1             1    0.2193  74.538 -1000.67
## - Self_Employed           1    0.2261  74.545 -1000.62
## <none>                                 74.319 -1000.23
## - EducationGraduate       1    0.3005  74.620 -1000.09
## - MarriedNo               1    0.5270  74.846  -998.49
## - Property_AreaSemiurban  1    1.0786  75.398  -994.60
## - Credit_History          1   31.1879 105.507  -816.86
## 
## Step:  AIC=-1002.2
## Loan_Status ~ ApplicantIncome + CoapplicantIncome + LoanAmount + 
##     Loan_Amount_Term + Credit_History + Gender + GenderFemale + 
##     Married + MarriedNo + EducationGraduate + Self_Employed + 
##     Self_EmployedNo + Property_AreaRural + Property_AreaSemiurban + 
##     Dependents + Dependents0 + Dependents1
## 
##                          Df Sum of Sq     RSS      AIC
## - Self_EmployedNo         1    0.0209  74.344 -1004.05
## - ApplicantIncome         1    0.0328  74.356 -1003.97
## - Dependents              1    0.0374  74.360 -1003.93
## - Gender                  1    0.0393  74.362 -1003.92
## - Married                 1    0.0504  74.373 -1003.84
## - Loan_Amount_Term        1    0.1071  74.430 -1003.44
## - Property_AreaRural      1    0.1333  74.456 -1003.25
## - Dependents0             1    0.1341  74.457 -1003.25
## - GenderFemale            1    0.1419  74.465 -1003.19
## - CoapplicantIncome       1    0.1763  74.499 -1002.95
## - LoanAmount              1    0.1992  74.522 -1002.78
## - Self_Employed           1    0.2253  74.548 -1002.60
## <none>                                 74.323 -1002.20
## - EducationGraduate       1    0.2995  74.622 -1002.07
## - Dependents1             1    0.3320  74.655 -1001.84
## - MarriedNo               1    0.5249  74.848 -1000.48
## - Property_AreaSemiurban  1    1.0882  75.411  -996.51
## - Credit_History          1   31.1959 105.519  -818.80
## 
## Step:  AIC=-1004.05
## Loan_Status ~ ApplicantIncome + CoapplicantIncome + LoanAmount + 
##     Loan_Amount_Term + Credit_History + Gender + GenderFemale + 
##     Married + MarriedNo + EducationGraduate + Self_Employed + 
##     Property_AreaRural + Property_AreaSemiurban + Dependents + 
##     Dependents0 + Dependents1
## 
##                          Df Sum of Sq     RSS      AIC
## - ApplicantIncome         1    0.0288  74.372 -1005.85
## - Dependents              1    0.0367  74.380 -1005.79
## - Gender                  1    0.0414  74.385 -1005.76
## - Married                 1    0.0512  74.395 -1005.69
## - Loan_Amount_Term        1    0.1044  74.448 -1005.31
## - Dependents0             1    0.1307  74.474 -1005.12
## - Property_AreaRural      1    0.1389  74.483 -1005.06
## - GenderFemale            1    0.1435  74.487 -1005.03
## - CoapplicantIncome       1    0.1782  74.522 -1004.78
## - LoanAmount              1    0.2035  74.547 -1004.61
## - Self_Employed           1    0.2128  74.556 -1004.54
## <none>                                 74.344 -1004.05
## - EducationGraduate       1    0.3020  74.646 -1003.91
## - Dependents1             1    0.3422  74.686 -1003.62
## - MarriedNo               1    0.5287  74.872 -1002.30
## - Property_AreaSemiurban  1    1.0850  75.429  -998.39
## - Credit_History          1   31.1889 105.533  -820.73
## 
## Step:  AIC=-1005.85
## Loan_Status ~ CoapplicantIncome + LoanAmount + Loan_Amount_Term + 
##     Credit_History + Gender + GenderFemale + Married + MarriedNo + 
##     EducationGraduate + Self_Employed + Property_AreaRural + 
##     Property_AreaSemiurban + Dependents + Dependents0 + Dependents1
## 
##                          Df Sum of Sq     RSS      AIC
## - Gender                  1    0.0371  74.410 -1007.58
## - Dependents              1    0.0384  74.411 -1007.57
## - Married                 1    0.0504  74.423 -1007.49
## - Loan_Amount_Term        1    0.1162  74.489 -1007.02
## - Dependents0             1    0.1314  74.504 -1006.91
## - Property_AreaRural      1    0.1446  74.517 -1006.82
## - GenderFemale            1    0.1478  74.520 -1006.80
## - LoanAmount              1    0.1881  74.561 -1006.51
## - Self_Employed           1    0.2195  74.592 -1006.29
## - CoapplicantIncome       1    0.2341  74.607 -1006.18
## <none>                                 74.372 -1005.85
## - EducationGraduate       1    0.3143  74.687 -1005.62
## - Dependents1             1    0.3437  74.716 -1005.41
## - MarriedNo               1    0.5185  74.891 -1004.17
## - Property_AreaSemiurban  1    1.0728  75.445 -1000.27
## - Credit_History          1   31.1615 105.534  -822.72
## 
## Step:  AIC=-1007.58
## Loan_Status ~ CoapplicantIncome + LoanAmount + Loan_Amount_Term + 
##     Credit_History + GenderFemale + Married + MarriedNo + EducationGraduate + 
##     Self_Employed + Property_AreaRural + Property_AreaSemiurban + 
##     Dependents + Dependents0 + Dependents1
## 
##                          Df Sum of Sq     RSS      AIC
## - Dependents              1    0.0363  74.446 -1009.32
## - Married                 1    0.0506  74.460 -1009.22
## - Loan_Amount_Term        1    0.1149  74.525 -1008.77
## - Dependents0             1    0.1294  74.539 -1008.66
## - GenderFemale            1    0.1394  74.549 -1008.59
## - Property_AreaRural      1    0.1460  74.556 -1008.55
## - LoanAmount              1    0.2095  74.619 -1008.10
## - Self_Employed           1    0.2248  74.634 -1007.99
## - CoapplicantIncome       1    0.2254  74.635 -1007.98
## <none>                                 74.410 -1007.58
## - EducationGraduate       1    0.3071  74.717 -1007.40
## - Dependents1             1    0.3347  74.744 -1007.21
## - MarriedNo               1    0.5198  74.929 -1005.90
## - Property_AreaSemiurban  1    1.0663  75.476 -1002.06
## - Credit_History          1   31.3212 105.731  -823.74
## 
## Step:  AIC=-1009.32
## Loan_Status ~ CoapplicantIncome + LoanAmount + Loan_Amount_Term + 
##     Credit_History + GenderFemale + Married + MarriedNo + EducationGraduate + 
##     Self_Employed + Property_AreaRural + Property_AreaSemiurban + 
##     Dependents0 + Dependents1
## 
##                          Df Sum of Sq     RSS      AIC
## - Married                 1     0.026  74.472 -1011.14
## - Dependents0             1     0.106  74.551 -1010.57
## - Loan_Amount_Term        1     0.120  74.566 -1010.47
## - Property_AreaRural      1     0.139  74.585 -1010.34
## - GenderFemale            1     0.143  74.589 -1010.31
## - LoanAmount              1     0.204  74.650 -1009.88
## - CoapplicantIncome       1     0.224  74.670 -1009.73
## - Self_Employed           1     0.229  74.675 -1009.70
## <none>                                 74.446 -1009.32
## - Dependents1             1     0.308  74.754 -1009.14
## - EducationGraduate       1     0.319  74.765 -1009.06
## - MarriedNo               1     0.542  74.988 -1007.49
## - Property_AreaSemiurban  1     1.086  75.531 -1003.67
## - Credit_History          1    32.034 106.480  -822.01
## 
## Step:  AIC=-1011.14
## Loan_Status ~ CoapplicantIncome + LoanAmount + Loan_Amount_Term + 
##     Credit_History + GenderFemale + MarriedNo + EducationGraduate + 
##     Self_Employed + Property_AreaRural + Property_AreaSemiurban + 
##     Dependents0 + Dependents1
## 
##                          Df Sum of Sq     RSS      AIC
## - Loan_Amount_Term        1     0.112  74.584 -1012.34
## - Dependents0             1     0.115  74.587 -1012.32
## - Property_AreaRural      1     0.144  74.616 -1012.11
## - GenderFemale            1     0.145  74.617 -1012.11
## - LoanAmount              1     0.204  74.677 -1011.69
## - Self_Employed           1     0.228  74.700 -1011.52
## - CoapplicantIncome       1     0.230  74.702 -1011.51
## <none>                                 74.472 -1011.14
## - Dependents1             1     0.321  74.793 -1010.86
## - EducationGraduate       1     0.324  74.797 -1010.84
## - MarriedNo               1     0.546  75.019 -1009.27
## - Property_AreaSemiurban  1     1.082  75.554 -1005.51
## - Credit_History          1    32.111 106.583  -823.49
## 
## Step:  AIC=-1012.34
## Loan_Status ~ CoapplicantIncome + LoanAmount + Credit_History + 
##     GenderFemale + MarriedNo + EducationGraduate + Self_Employed + 
##     Property_AreaRural + Property_AreaSemiurban + Dependents0 + 
##     Dependents1
## 
##                          Df Sum of Sq     RSS      AIC
## - Dependents0             1     0.119  74.703 -1013.50
## - GenderFemale            1     0.160  74.744 -1013.21
## - Property_AreaRural      1     0.161  74.745 -1013.20
## - LoanAmount              1     0.214  74.798 -1012.83
## - CoapplicantIncome       1     0.231  74.815 -1012.71
## - Self_Employed           1     0.235  74.819 -1012.68
## <none>                                 74.584 -1012.34
## - Dependents1             1     0.292  74.876 -1012.28
## - EducationGraduate       1     0.298  74.882 -1012.23
## - MarriedNo               1     0.577  75.161 -1010.27
## - Property_AreaSemiurban  1     1.040  75.624 -1007.02
## - Credit_History          1    32.088 106.672  -825.05
## 
## Step:  AIC=-1013.5
## Loan_Status ~ CoapplicantIncome + LoanAmount + Credit_History + 
##     GenderFemale + MarriedNo + EducationGraduate + Self_Employed + 
##     Property_AreaRural + Property_AreaSemiurban + Dependents1
## 
##                          Df Sum of Sq     RSS      AIC
## - Property_AreaRural      1     0.177  74.880 -1014.25
## - GenderFemale            1     0.181  74.884 -1014.22
## - LoanAmount              1     0.182  74.885 -1014.21
## - Dependents1             1     0.183  74.885 -1014.21
## - Self_Employed           1     0.238  74.940 -1013.82
## - CoapplicantIncome       1     0.257  74.959 -1013.69
## <none>                                 74.703 -1013.50
## - EducationGraduate       1     0.284  74.986 -1013.50
## - MarriedNo               1     0.840  75.543 -1009.59
## - Property_AreaSemiurban  1     1.008  75.710 -1008.41
## - Credit_History          1    31.969 106.672  -827.05
## 
## Step:  AIC=-1014.25
## Loan_Status ~ CoapplicantIncome + LoanAmount + Credit_History + 
##     GenderFemale + MarriedNo + EducationGraduate + Self_Employed + 
##     Property_AreaSemiurban + Dependents1
## 
##                          Df Sum of Sq     RSS      AIC
## - Dependents1             1     0.145  75.025 -1015.23
## - GenderFemale            1     0.178  75.058 -1014.99
## - Self_Employed           1     0.218  75.098 -1014.71
## - LoanAmount              1     0.222  75.102 -1014.69
## - CoapplicantIncome       1     0.256  75.136 -1014.45
## <none>                                 74.880 -1014.25
## - EducationGraduate       1     0.330  75.210 -1013.93
## - MarriedNo               1     0.856  75.736 -1010.24
## - Property_AreaSemiurban  1     2.047  76.927 -1001.99
## - Credit_History          1    31.930 106.810  -828.37
## 
## Step:  AIC=-1015.23
## Loan_Status ~ CoapplicantIncome + LoanAmount + Credit_History + 
##     GenderFemale + MarriedNo + EducationGraduate + Self_Employed + 
##     Property_AreaSemiurban
## 
##                          Df Sum of Sq     RSS      AIC
## - GenderFemale            1     0.204  75.228 -1015.79
## - Self_Employed           1     0.217  75.242 -1015.70
## - LoanAmount              1     0.242  75.267 -1015.52
## - CoapplicantIncome       1     0.243  75.268 -1015.52
## <none>                                 75.025 -1015.23
## - EducationGraduate       1     0.327  75.351 -1014.93
## - MarriedNo               1     0.775  75.799 -1011.79
## - Property_AreaSemiurban  1     2.053  77.078 -1002.95
## - Credit_History          1    31.892 106.917  -829.84
## 
## Step:  AIC=-1015.79
## Loan_Status ~ CoapplicantIncome + LoanAmount + Credit_History + 
##     MarriedNo + EducationGraduate + Self_Employed + Property_AreaSemiurban
## 
##                          Df Sum of Sq     RSS      AIC
## - CoapplicantIncome       1     0.198  75.426 -1016.41
## - Self_Employed           1     0.200  75.428 -1016.39
## - LoanAmount              1     0.224  75.452 -1016.22
## <none>                                 75.228 -1015.79
## - EducationGraduate       1     0.295  75.524 -1015.72
## - MarriedNo               1     1.191  76.419 -1009.49
## - Property_AreaSemiurban  1     1.927  77.155 -1004.42
## - Credit_History          1    32.011 107.240  -830.24
## 
## Step:  AIC=-1016.41
## Loan_Status ~ LoanAmount + Credit_History + MarriedNo + EducationGraduate + 
##     Self_Employed + Property_AreaSemiurban
## 
##                          Df Sum of Sq     RSS     AIC
## - Self_Employed           1     0.208  75.634 -1017.0
## - EducationGraduate       1     0.277  75.703 -1016.5
## <none>                                 75.426 -1016.4
## - LoanAmount              1     0.289  75.715 -1016.4
## - MarriedNo               1     1.113  76.539 -1010.7
## - Property_AreaSemiurban  1     1.944  77.370 -1004.9
## - Credit_History          1    32.066 107.492  -831.0
## 
## Step:  AIC=-1016.95
## Loan_Status ~ LoanAmount + Credit_History + MarriedNo + EducationGraduate + 
##     Property_AreaSemiurban
## 
##                          Df Sum of Sq     RSS      AIC
## - EducationGraduate       1     0.265  75.900 -1017.09
## <none>                                 75.634 -1016.95
## - LoanAmount              1     0.287  75.921 -1016.94
## - MarriedNo               1     1.061  76.695 -1011.58
## - Property_AreaSemiurban  1     1.924  77.558 -1005.66
## - Credit_History          1    32.598 108.232  -829.37
## 
## Step:  AIC=-1017.09
## Loan_Status ~ LoanAmount + Credit_History + MarriedNo + Property_AreaSemiurban
## 
##                          Df Sum of Sq     RSS      AIC
## - LoanAmount              1     0.204  76.104 -1017.67
## <none>                                 75.900 -1017.09
## - MarriedNo               1     1.015  76.915 -1012.06
## - Property_AreaSemiurban  1     1.976  77.875 -1005.50
## - Credit_History          1    33.153 109.053  -827.37
## 
## Step:  AIC=-1017.67
## Loan_Status ~ Credit_History + MarriedNo + Property_AreaSemiurban
## 
##                          Df Sum of Sq     RSS      AIC
## <none>                                 76.104 -1017.67
## - MarriedNo               1     0.890  76.994 -1013.52
## - Property_AreaSemiurban  1     1.978  78.082 -1006.10
## - Credit_History          1    33.272 109.376  -827.81
# Print the final model
summary(final_model)
## 
## Call:
## lm(formula = Loan_Status ~ Credit_History + MarriedNo + Property_AreaSemiurban, 
##     data = normalized_training_dataset)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.90313 -0.07414  0.09687  0.22199  1.01157 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             0.07414    0.04638   1.598 0.110547    
## Credit_History          0.70387    0.04646  15.150  < 2e-16 ***
## MarriedNo              -0.08571    0.03459  -2.478 0.013526 *  
## Property_AreaSemiurban  0.12512    0.03387   3.694 0.000244 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3807 on 525 degrees of freedom
## Multiple R-squared:  0.3252, Adjusted R-squared:  0.3213 
## F-statistic: 84.32 on 3 and 525 DF,  p-value: < 2.2e-16

From all the above feature selection methods and based on the business use cases, we can conclude that all methods pointed towards - Applicant Income, Credit History, MarriedYes,MarriedNo_,Education - Graduate, Education - Not Graduate, Self employed - Yes, Self employed No, are the variables that are significant for our analysis.
# DROPPING UNWANTED VARIABLES [COLUMNS]
library(class)
head(normalized_training_dataset)
##   ApplicantIncome CoapplicantIncome LoanAmount Loan_Amount_Term Credit_History
## 2      0.05482993        0.04456660 0.17221418        0.7297297              1
## 3      0.03525046        0.00000000 0.08248915        0.7297297              1
## 4      0.03009276        0.06968703 0.16063676        0.7297297              1
## 5      0.07235622        0.00000000 0.19102750        0.7297297              1
## 6      0.06514533        0.12400627 0.37337192        0.7297297              1
## 7      0.02700062        0.04480303 0.12445731        0.7297297              1
##   Gender GenderFemale GenderMale Married MarriedNo MarriedYes EducationGraduate
## 2      0            0          1       0         0          1                 1
## 3      0            0          1       0         0          1                 1
## 4      0            0          1       0         0          1                 0
## 5      0            0          1       0         1          0                 1
## 6      0            0          1       0         0          1                 1
## 7      0            0          1       0         0          1                 0
##   EducationNot Graduate Self_Employed Self_EmployedNo Self_EmployedYes
## 2                     0             0               1                0
## 3                     0             0               0                1
## 4                     1             0               1                0
## 5                     0             0               1                0
## 6                     0             0               0                1
## 7                     1             0               1                0
##   Property_AreaRural Property_AreaSemiurban Property_AreaUrban Dependents
## 2                  1                      0                  0          0
## 3                  0                      0                  1          0
## 4                  0                      0                  1          0
## 5                  0                      0                  1          0
## 6                  0                      0                  1          0
## 7                  0                      0                  1          0
##   Dependents0 Dependents1 Dependents2 Dependents3+ Loan_Status
## 2           0           1           0            0           0
## 3           1           0           0            0           1
## 4           1           0           0            0           1
## 5           1           0           0            0           1
## 6           0           0           1            0           1
## 7           1           0           0            0           1
normalized_class_training_dataset <-normalized_training_dataset[, -c(2,3,4,6,7,8,9,14,17,18,19,20,21,22,23,24)] # Removing Variables and Columns that were not significant enough and choosing only variables required for proceeding with our analysis.
head(normalized_class_training_dataset)
##   ApplicantIncome Credit_History MarriedNo MarriedYes EducationGraduate
## 2      0.05482993              1         0          1                 1
## 3      0.03525046              1         0          1                 1
## 4      0.03009276              1         0          1                 0
## 5      0.07235622              1         1          0                 1
## 6      0.06514533              1         0          1                 1
## 7      0.02700062              1         0          1                 0
##   EducationNot Graduate Self_EmployedNo Self_EmployedYes Loan_Status
## 2                     0               1                0           0
## 3                     0               0                1           1
## 4                     1               1                0           1
## 5                     0               1                0           1
## 6                     0               0                1           1
## 7                     1               1                0           1

In the following section, visual representations such as PCA,and Pair Matrix have been used to depict the selected variables.
# SELECTED VARIABLE PLOTS:

# 1. PCA FOR SELECTED VARIABLES:
head(normalized_class_training_dataset)
##   ApplicantIncome Credit_History MarriedNo MarriedYes EducationGraduate
## 2      0.05482993              1         0          1                 1
## 3      0.03525046              1         0          1                 1
## 4      0.03009276              1         0          1                 0
## 5      0.07235622              1         1          0                 1
## 6      0.06514533              1         0          1                 1
## 7      0.02700062              1         0          1                 0
##   EducationNot Graduate Self_EmployedNo Self_EmployedYes Loan_Status
## 2                     0               1                0           0
## 3                     0               0                1           1
## 4                     1               1                0           1
## 5                     0               1                0           1
## 6                     0               0                1           1
## 7                     1               1                0           1
library(FactoMineR)
PCA(normalized_class_training_dataset)

## **Results for the Principal Component Analysis (PCA)**
## The analysis was performed on 529 individuals, described by 9 variables
## *The results are available in the following objects:
## 
##    name               description                          
## 1  "$eig"             "eigenvalues"                        
## 2  "$var"             "results for the variables"          
## 3  "$var$coord"       "coord. for the variables"           
## 4  "$var$cor"         "correlations variables - dimensions"
## 5  "$var$cos2"        "cos2 for the variables"             
## 6  "$var$contrib"     "contributions of the variables"     
## 7  "$ind"             "results for the individuals"        
## 8  "$ind$coord"       "coord. for the individuals"         
## 9  "$ind$cos2"        "cos2 for the individuals"           
## 10 "$ind$contrib"     "contributions of the individuals"   
## 11 "$call"            "summary statistics"                 
## 12 "$call$centre"     "mean of the variables"              
## 13 "$call$ecart.type" "standard error of the variables"    
## 14 "$call$row.w"      "weights for the individuals"        
## 15 "$call$col.w"      "weights for the variables"

# 2. Pair Matrix for selected variables:
library(psych)
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
pairs.panels(normalized_class_training_dataset[1:9],gap=0,bg=c("red","yellow","blue")[normalized_class_training_dataset$Loan_Status],pch=21)


Performing hyper-parameter tuning using Grid Search method to determine the best ‘k’ value.
library(caret)
# Determining optimum 'k' value:
# 1. Tuning 'k':
colnames(normalized_class_training_dataset)
## [1] "ApplicantIncome"       "Credit_History"        "MarriedNo"            
## [4] "MarriedYes"            "EducationGraduate"     "EducationNot Graduate"
## [7] "Self_EmployedNo"       "Self_EmployedYes"      "Loan_Status"
model <- train(Loan_Status~`ApplicantIncome`+`Credit_History`+`MarriedNo`+`MarriedYes`+`EducationGraduate`+`EducationNot Graduate`+`Self_EmployedNo`+`Self_EmployedYes`, data=normalized_class_training_dataset, method="knn")
## Warning in train.default(x, y, weights = w, ...): You are trying to do
## regression and your outcome only has two possible values Are you trying to do
## classification? If so, use a 2 level factor as your outcome column.
model
## k-Nearest Neighbors 
## 
## 529 samples
##   8 predictor
## 
## No pre-processing
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 529, 529, 529, 529, 529, 529, ... 
## Resampling results across tuning parameters:
## 
##   k  RMSE       Rsquared   MAE      
##   5  0.4471036  0.1726820  0.2947962
##   7  0.4356280  0.1824096  0.3004809
##   9  0.4300371  0.1879652  0.3049117
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 9.

# Testing:
colnames(Testing_dataset)
##  [1] "Loan_ID"           "Gender"            "Married"          
##  [4] "Dependents"        "Education"         "Self_Employed"    
##  [7] "ApplicantIncome"   "CoapplicantIncome" "LoanAmount"       
## [10] "Loan_Amount_Term"  "Credit_History"    "Property_Area"

from the above testing dataset, we must remove the insignificant variables, so we are left only with the relevant variables for our analysis.
Testing_dataset <- Testing_dataset[, -c(1,2,4,8,9,10,12)]
head(Testing_dataset)
##   Married    Education Self_Employed ApplicantIncome Credit_History
## 1     Yes     Graduate            No            5720              1
## 2     Yes     Graduate            No            3076              1
## 3     Yes     Graduate            No            5000              1
## 4     Yes     Graduate            No            2340             NA
## 5      No Not Graduate            No            3276              1
## 6     Yes Not Graduate           Yes            2165              1

# CONVERTING CATEGORICAL VARIABLES TO NUMERIC BY ONE-HOT ENCODING IN TESTING DATASET:
library(caret)
head(Testing_dataset)
##   Married    Education Self_Employed ApplicantIncome Credit_History
## 1     Yes     Graduate            No            5720              1
## 2     Yes     Graduate            No            3076              1
## 3     Yes     Graduate            No            5000              1
## 4     Yes     Graduate            No            2340             NA
## 5      No Not Graduate            No            3276              1
## 6     Yes Not Graduate           Yes            2165              1
dummy_Married_Test <- dummyVars(~Married, data=Testing_dataset)
dummy_Education_Test <- dummyVars(~Education, data=Testing_dataset)
dummy_Self_Employed_Test <- dummyVars(~Self_Employed, data=Testing_dataset)
encoded_Testing_dataset <- cbind(Testing_dataset,
                          
                                  predict(dummy_Married_Test,Testing_dataset),
                                  predict(dummy_Education_Test,Testing_dataset),
                                  predict(dummy_Self_Employed_Test,Testing_dataset))
head(encoded_Testing_dataset)
##   Married    Education Self_Employed ApplicantIncome Credit_History MarriedNo
## 1     Yes     Graduate            No            5720              1         0
## 2     Yes     Graduate            No            3076              1         0
## 3     Yes     Graduate            No            5000              1         0
## 4     Yes     Graduate            No            2340             NA         0
## 5      No Not Graduate            No            3276              1         1
## 6     Yes Not Graduate           Yes            2165              1         0
##   MarriedYes EducationGraduate EducationNot Graduate Self_Employed
## 1          1                 1                     0             0
## 2          1                 1                     0             0
## 3          1                 1                     0             0
## 4          1                 1                     0             0
## 5          0                 0                     1             0
## 6          1                 0                     1             0
##   Self_EmployedNo Self_EmployedYes
## 1               1                0
## 2               1                0
## 3               1                0
## 4               1                0
## 5               1                0
## 6               0                1

# REMOVE ENCODED COLUMNS:
encoded_Testing_dataset <- encoded_Testing_dataset[, -c(1,2,3)]
head(encoded_Testing_dataset)
##   ApplicantIncome Credit_History MarriedNo MarriedYes EducationGraduate
## 1            5720              1         0          1                 1
## 2            3076              1         0          1                 1
## 3            5000              1         0          1                 1
## 4            2340             NA         0          1                 1
## 5            3276              1         1          0                 0
## 6            2165              1         0          1                 0
##   EducationNot Graduate Self_Employed Self_EmployedNo Self_EmployedYes
## 1                     0             0               1                0
## 2                     0             0               1                0
## 3                     0             0               1                0
## 4                     0             0               1                0
## 5                     1             0               1                0
## 6                     1             0               0                1

# NORMALIZE THE TESTING SET:
summary(encoded_Testing_dataset)
##  ApplicantIncome Credit_History     MarriedNo        MarriedYes    
##  Min.   :    0   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.: 2864   1st Qu.:1.0000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median : 3786   Median :1.0000   Median :0.0000   Median :1.0000  
##  Mean   : 4806   Mean   :0.8254   Mean   :0.3651   Mean   :0.6349  
##  3rd Qu.: 5060   3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:1.0000  
##  Max.   :72529   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##                  NA's   :29                                        
##  EducationGraduate EducationNot Graduate Self_Employed     Self_EmployedNo 
##  Min.   :0.0000    Min.   :0.0000        Min.   :0.00000   Min.   :0.0000  
##  1st Qu.:1.0000    1st Qu.:0.0000        1st Qu.:0.00000   1st Qu.:1.0000  
##  Median :1.0000    Median :0.0000        Median :0.00000   Median :1.0000  
##  Mean   :0.7711    Mean   :0.2289        Mean   :0.06267   Mean   :0.8365  
##  3rd Qu.:1.0000    3rd Qu.:0.0000        3rd Qu.:0.00000   3rd Qu.:1.0000  
##  Max.   :1.0000    Max.   :1.0000        Max.   :1.00000   Max.   :1.0000  
##                                                                            
##  Self_EmployedYes
##  Min.   :0.0000  
##  1st Qu.:0.0000  
##  Median :0.0000  
##  Mean   :0.1008  
##  3rd Qu.:0.0000  
##  Max.   :1.0000  
## 
encoded_Testing_dataset_norm <- preProcess(encoded_Testing_dataset,method = c('range'))
normalized_Testing_dataset <- predict(encoded_Testing_dataset_norm,encoded_Testing_dataset)
head(normalized_Testing_dataset)
##   ApplicantIncome Credit_History MarriedNo MarriedYes EducationGraduate
## 1      0.07886501              1         0          1                 1
## 2      0.04241062              1         0          1                 1
## 3      0.06893794              1         0          1                 1
## 4      0.03226296             NA         0          1                 1
## 5      0.04516814              1         1          0                 0
## 6      0.02985013              1         0          1                 0
##   EducationNot Graduate Self_Employed Self_EmployedNo Self_EmployedYes
## 1                     0             0               1                0
## 2                     0             0               1                0
## 3                     0             0               1                0
## 4                     0             0               1                0
## 5                     1             0               1                0
## 6                     1             0               0                1
summary(normalized_Testing_dataset)
##  ApplicantIncome   Credit_History     MarriedNo        MarriedYes    
##  Min.   :0.00000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.03949   1st Qu.:1.0000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :0.05220   Median :1.0000   Median :0.0000   Median :1.0000  
##  Mean   :0.06626   Mean   :0.8254   Mean   :0.3651   Mean   :0.6349  
##  3rd Qu.:0.06977   3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:1.0000  
##  Max.   :1.00000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##                    NA's   :29                                        
##  EducationGraduate EducationNot Graduate Self_Employed     Self_EmployedNo 
##  Min.   :0.0000    Min.   :0.0000        Min.   :0.00000   Min.   :0.0000  
##  1st Qu.:1.0000    1st Qu.:0.0000        1st Qu.:0.00000   1st Qu.:1.0000  
##  Median :1.0000    Median :0.0000        Median :0.00000   Median :1.0000  
##  Mean   :0.7711    Mean   :0.2289        Mean   :0.06267   Mean   :0.8365  
##  3rd Qu.:1.0000    3rd Qu.:0.0000        3rd Qu.:0.00000   3rd Qu.:1.0000  
##  Max.   :1.0000    Max.   :1.0000        Max.   :1.00000   Max.   :1.0000  
##                                                                            
##  Self_EmployedYes
##  Min.   :0.0000  
##  1st Qu.:0.0000  
##  Median :0.0000  
##  Mean   :0.1008  
##  3rd Qu.:0.0000  
##  Max.   :1.0000  
## 

Performing K-NN for K values - k=5,7,9
# Set CRAN mirror
options(repos = c(CRAN = "https://cran.rstudio.com"))

# Install necessary packages
install.packages("readxl", dependencies = TRUE)
## 
## The downloaded binary packages are in
##  /var/folders/qf/zm1f11m105v4s5y5sf6c_6yr0000gn/T//RtmpnQ2Nqp/downloaded_packages
install.packages("class", dependencies = TRUE)
## 
## The downloaded binary packages are in
##  /var/folders/qf/zm1f11m105v4s5y5sf6c_6yr0000gn/T//RtmpnQ2Nqp/downloaded_packages
install.packages("e1071", dependencies = TRUE)
## 
## The downloaded binary packages are in
##  /var/folders/qf/zm1f11m105v4s5y5sf6c_6yr0000gn/T//RtmpnQ2Nqp/downloaded_packages
install.packages("caret", dependencies = TRUE)
## 
## The downloaded binary packages are in
##  /var/folders/qf/zm1f11m105v4s5y5sf6c_6yr0000gn/T//RtmpnQ2Nqp/downloaded_packages
install.packages("ggplot2", dependencies = TRUE)
## 
## The downloaded binary packages are in
##  /var/folders/qf/zm1f11m105v4s5y5sf6c_6yr0000gn/T//RtmpnQ2Nqp/downloaded_packages
install.packages("reshape2", dependencies = TRUE)
## 
## The downloaded binary packages are in
##  /var/folders/qf/zm1f11m105v4s5y5sf6c_6yr0000gn/T//RtmpnQ2Nqp/downloaded_packages
install.packages("pROC", dependencies = TRUE)
## 
## The downloaded binary packages are in
##  /var/folders/qf/zm1f11m105v4s5y5sf6c_6yr0000gn/T//RtmpnQ2Nqp/downloaded_packages
# Load necessary libraries
library(readxl)
library(class)
library(e1071)
library(caret)
library(ggplot2)
library(reshape2)
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
train_set <- normalized_class_training_dataset
test_set <- normalized_Testing_dataset
test_set <- test_set[,-7]
head(train_set)
##   ApplicantIncome Credit_History MarriedNo MarriedYes EducationGraduate
## 2      0.05482993              1         0          1                 1
## 3      0.03525046              1         0          1                 1
## 4      0.03009276              1         0          1                 0
## 5      0.07235622              1         1          0                 1
## 6      0.06514533              1         0          1                 1
## 7      0.02700062              1         0          1                 0
##   EducationNot Graduate Self_EmployedNo Self_EmployedYes Loan_Status
## 2                     0               1                0           0
## 3                     0               0                1           1
## 4                     1               1                0           1
## 5                     0               1                0           1
## 6                     0               0                1           1
## 7                     1               1                0           1
head(test_set)
##   ApplicantIncome Credit_History MarriedNo MarriedYes EducationGraduate
## 1      0.07886501              1         0          1                 1
## 2      0.04241062              1         0          1                 1
## 3      0.06893794              1         0          1                 1
## 4      0.03226296             NA         0          1                 1
## 5      0.04516814              1         1          0                 0
## 6      0.02985013              1         0          1                 0
##   EducationNot Graduate Self_EmployedNo Self_EmployedYes
## 1                     0               1                0
## 2                     0               1                0
## 3                     0               1                0
## 4                     0               1                0
## 5                     1               1                0
## 6                     1               0                1
test_set <- na.omit(test_set)
# Separate features and target variable in the training data
train_features <- train_set[, -which(names(train_set) == "Loan_Status")]
train_target <- train_set$Loan_Status
# Features in the testing data
test_features <- test_set

# Model's Performance when k=5
knn_predictions_k5 <- knn(train = train_features, test = test_features, cl = train_target, k = 5)
knn_predictions_k5
##   [1] 1 1 1 1 0 1 0 1 1 1 1 0 1 1 1 0 1 1 1 1 1 1 1 0 1 0 1 1 1 1 1 0 1 1 1 1 1
##  [38] 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 0 1 1 1 1 0 1 1 0 0 1 0 1 1 1 1 0 1 1 1 1
##  [75] 1 0 0 0 1 0 1 1 0 1 1 1 1 0 0 1 1 1 1 1 0 0 1 1 0 1 1 1 1 1 1 1 1 1 0 0 0
## [112] 1 1 1 0 0 1 0 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0 1 0 1 1 1 1 1 1 1 0 1 1 1 1 1
## [149] 0 1 0 1 1 1 0 1 0 1 1 1 1 0 0 1 1 1 0 1 1 0 1 1 1 0 1 1 1 0 0 1 1 0 1 0 1
## [186] 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 1 0 1 1 1 0 1 1 1 1 0 0 0 1 1 1 0 0 0 1
## [223] 1 1 0 1 0 1 0 1 1 0 0 0 0 1 0 0 0 0 1 1 1 1 1 1 0 1 0 1 1 0 0 0 0 1 1 1 0
## [260] 0 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 0 1 0 1 1 1 1 0 1 0 1 1 1 1 1 0 1
## [297] 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 0 1 0 1 1 1 1 0 0 1 1 1 1 1 0 1 0 1 1
## [334] 1 0 1 1 1
## Levels: 0 1

# Model's Performance when k=7
knn_predictions_k7 <- knn(train = train_features, test = test_features, cl = train_target, k = 7)
knn_predictions_k7
##   [1] 1 1 1 1 1 1 0 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 0 1 0 1 1 1 1 1 0 1 1 1 1 1
##  [38] 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 0 1 1 1 1 0 1 1 0 0 1 0 1 1 1 1 1 1 1 1 1
##  [75] 1 0 1 0 1 0 1 1 1 1 1 1 1 1 0 1 1 1 1 1 0 0 1 1 0 1 1 1 1 1 1 1 1 1 0 0 0
## [112] 0 1 1 0 0 1 0 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0 1 0 1 1 1 1 1 1 1 0 1 1 1 1 1
## [149] 1 1 0 1 1 1 1 1 0 1 1 1 1 0 0 1 1 1 0 1 1 0 1 1 1 1 1 1 1 0 0 1 1 0 1 0 1
## [186] 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 1 1 1 1 0 0 1 1
## [223] 1 1 0 1 0 1 0 1 1 1 1 0 0 1 0 1 0 0 1 1 1 1 1 1 0 1 0 1 1 1 0 0 0 1 1 1 0
## [260] 0 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 0 1 1 1 1 1 0 1
## [297] 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 0 1 1
## [334] 1 1 1 1 1
## Levels: 0 1

# Model's Performance when k=9
knn_predictions_k9 <- knn(train = train_features, test = test_features, cl = train_target, k = 9)
knn_predictions_k9
##   [1] 1 1 1 1 1 1 0 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 0 1 0 1 1 1 1 1 0 1 1 1 1 1
##  [38] 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 0 1 1 1 1 0 1 1 0 0 1 0 1 1 1 1 1 1 1 1 1
##  [75] 1 0 1 0 1 0 1 1 0 1 1 1 1 1 0 1 1 1 1 1 0 0 1 1 0 1 1 1 1 1 1 1 1 1 0 0 0
## [112] 1 1 1 0 0 1 0 1 0 1 1 1 1 1 1 1 1 1 1 0 1 0 1 0 1 0 1 1 0 1 1 0 1 1 1 1 1
## [149] 1 1 0 1 1 1 1 1 0 1 1 1 1 0 0 1 1 1 0 1 1 0 1 1 1 1 1 1 1 0 0 1 1 0 1 0 1
## [186] 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 1 1 1 1 0 0 1 1
## [223] 1 1 0 1 0 1 0 1 1 1 1 0 0 1 0 1 0 0 1 1 1 1 1 1 0 1 0 1 1 1 0 0 0 1 1 1 0
## [260] 0 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 0 1 1 1 1 1 0 1
## [297] 0 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 0 1 1
## [334] 1 1 1 1 1
## Levels: 0 1

Using Confusion Matrix to compare all potential k values,i.e k= 5,7 & 9. Based on these metrics, k = 9 seems to be the best choice.
# Create a validation set from the training data since the testing dataset does not have a target variable:
set.seed(123)
trainIndex <- createDataPartition(train_target, p = .8, 
                                  list = FALSE, 
                                  times = 1)
train_set_train <- train_set[trainIndex,]
train_set_val <- train_set[-trainIndex,]

# Separate features and target variable in the validation data
val_features <- train_set_val[, -which(names(train_set_val) == "Loan_Status")]
val_target <- train_set_val$Loan_Status

# Function to train and evaluate k-NN for a given k
evaluate_knn <- function(k) {
  knn_val_predictions <- knn(train = train_set_train[, -which(names(train_set_train) == "Loan_Status")], 
                             test = val_features, 
                             cl = train_set_train$Loan_Status, 
                             k = k)
  
  # Convert predictions and actual values to factors with the same levels
  val_target <- factor(val_target) # Ensure val_target is a factor
  knn_val_predictions <- factor(knn_val_predictions, levels = levels(val_target))
  
  # Confusion matrix for k-NN
  knn_conf_matrix <- confusionMatrix(knn_val_predictions, val_target)
  return(knn_conf_matrix)
}

# Evaluate k-NN for k=5, k=7, and k=9
knn_conf_matrix_k5 <- evaluate_knn(5)
knn_conf_matrix_k7 <- evaluate_knn(7)
knn_conf_matrix_k9 <- evaluate_knn(9)

# Print the confusion matrices
print(knn_conf_matrix_k5)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 15  7
##          1 15 68
##                                           
##                Accuracy : 0.7905          
##                  95% CI : (0.7001, 0.8638)
##     No Information Rate : 0.7143          
##     P-Value [Acc > NIR] : 0.04946         
##                                           
##                   Kappa : 0.442           
##                                           
##  Mcnemar's Test P-Value : 0.13559         
##                                           
##             Sensitivity : 0.5000          
##             Specificity : 0.9067          
##          Pos Pred Value : 0.6818          
##          Neg Pred Value : 0.8193          
##              Prevalence : 0.2857          
##          Detection Rate : 0.1429          
##    Detection Prevalence : 0.2095          
##       Balanced Accuracy : 0.7033          
##                                           
##        'Positive' Class : 0               
## 
print(knn_conf_matrix_k7)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 13  4
##          1 17 71
##                                           
##                Accuracy : 0.8             
##                  95% CI : (0.7107, 0.8717)
##     No Information Rate : 0.7143          
##     P-Value [Acc > NIR] : 0.030009        
##                                           
##                   Kappa : 0.4368          
##                                           
##  Mcnemar's Test P-Value : 0.008829        
##                                           
##             Sensitivity : 0.4333          
##             Specificity : 0.9467          
##          Pos Pred Value : 0.7647          
##          Neg Pred Value : 0.8068          
##              Prevalence : 0.2857          
##          Detection Rate : 0.1238          
##    Detection Prevalence : 0.1619          
##       Balanced Accuracy : 0.6900          
##                                           
##        'Positive' Class : 0               
## 
print(knn_conf_matrix_k9)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 13  4
##          1 17 71
##                                           
##                Accuracy : 0.8             
##                  95% CI : (0.7107, 0.8717)
##     No Information Rate : 0.7143          
##     P-Value [Acc > NIR] : 0.030009        
##                                           
##                   Kappa : 0.4368          
##                                           
##  Mcnemar's Test P-Value : 0.008829        
##                                           
##             Sensitivity : 0.4333          
##             Specificity : 0.9467          
##          Pos Pred Value : 0.7647          
##          Neg Pred Value : 0.8068          
##              Prevalence : 0.2857          
##          Detection Rate : 0.1238          
##    Detection Prevalence : 0.1619          
##       Balanced Accuracy : 0.6900          
##                                           
##        'Positive' Class : 0               
## 

# Confusion Matrix for k=5,7,9
plot_confusion_matrix <- function(cm, title) {
  cm_matrix <- as.data.frame(cm$table)
  colnames(cm_matrix) <- c("Prediction", "Reference", "Count")
  
  ggplot(data = cm_matrix, aes(x = Reference, y = Prediction)) +
    geom_tile(aes(fill = Count), color = "white") +
    geom_text(aes(label = Count), vjust = 1) +
    scale_fill_gradient(low = "white", high = "steelblue") +
    theme_minimal() +
    labs(title = title, x = "Actual", y = "Predicted")
}

# Plot confusion matrices for k=5, k=7, and k=9
plot_confusion_matrix(knn_conf_matrix_k5, "Confusion Matrix for k-NN (k=5)")

plot_confusion_matrix(knn_conf_matrix_k7, "Confusion Matrix for k-NN (k=7)")

plot_confusion_matrix(knn_conf_matrix_k9, "Confusion Matrix for k-NN (k=9)")


Performing Naive Bayes Classification Algorithm to predict the outcome:
# Train the Naive Bayes model
nb_model <- naiveBayes(Loan_Status ~ ., data = train_set_train)

# Make predictions on the validation data
nb_val_predictions <- predict(nb_model, val_features)

# Convert predictions and actual values to factors with the same levels
val_target <- factor(val_target) # Ensure val_target is a factor
nb_val_predictions <- factor(nb_val_predictions, levels = levels(val_target))

# Confusion matrix for Naive Bayes
nb_conf_matrix <- confusionMatrix(nb_val_predictions, val_target)
print(nb_conf_matrix)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 13  5
##          1 17 70
##                                           
##                Accuracy : 0.7905          
##                  95% CI : (0.7001, 0.8638)
##     No Information Rate : 0.7143          
##     P-Value [Acc > NIR] : 0.04946         
##                                           
##                   Kappa : 0.4167          
##                                           
##  Mcnemar's Test P-Value : 0.01902         
##                                           
##             Sensitivity : 0.4333          
##             Specificity : 0.9333          
##          Pos Pred Value : 0.7222          
##          Neg Pred Value : 0.8046          
##              Prevalence : 0.2857          
##          Detection Rate : 0.1238          
##    Detection Prevalence : 0.1714          
##       Balanced Accuracy : 0.6833          
##                                           
##        'Positive' Class : 0               
## 
# Plot Naive Bayes confusion matrix
plot_confusion_matrix(nb_conf_matrix, "Confusion Matrix for Naive Bayes")


Calculating the performance metrics for both K-NN & Naive Bayes Algorithms:
# performance metrics from confusion matrix
calculate_metrics <- function(cm) {
  accuracy <- cm$overall['Accuracy']
  recall <- cm$byClass['Sensitivity']
  precision <- cm$byClass['Pos Pred Value']
  specificity <- cm$byClass['Specificity']
  
  return(data.frame(Accuracy = accuracy, Recall = recall, Precision = precision, Specificity = specificity))
}

# Calculate metrics for k-NN with k=5, k=7, and k=9
metrics_k5 <- calculate_metrics(knn_conf_matrix_k5)
metrics_k7 <- calculate_metrics(knn_conf_matrix_k7)
metrics_k9 <- calculate_metrics(knn_conf_matrix_k9)

# Calculate metrics for Naive Bayes
metrics_nb <- calculate_metrics(nb_conf_matrix)

# Print metrics
print(metrics_k5)
##           Accuracy Recall Precision Specificity
## Accuracy 0.7904762    0.5 0.6818182   0.9066667
print(metrics_k7)
##          Accuracy    Recall Precision Specificity
## Accuracy      0.8 0.4333333 0.7647059   0.9466667
print(metrics_k9)
##          Accuracy    Recall Precision Specificity
## Accuracy      0.8 0.4333333 0.7647059   0.9466667
print(metrics_nb)
##           Accuracy    Recall Precision Specificity
## Accuracy 0.7904762 0.4333333 0.7222222   0.9333333

Combine metrics into a single data frame for comparison
all_metrics <- rbind(
  data.frame(Model = "k-NN (k=5)", metrics_k5),
  data.frame(Model = "k-NN (k=7)", metrics_k7),
  data.frame(Model = "k-NN (k=9)", metrics_k9),
  data.frame(Model = "Naive Bayes", metrics_nb)
)
print(all_metrics)
##                 Model  Accuracy    Recall Precision Specificity
## Accuracy   k-NN (k=5) 0.7904762 0.5000000 0.6818182   0.9066667
## Accuracy1  k-NN (k=7) 0.8000000 0.4333333 0.7647059   0.9466667
## Accuracy2  k-NN (k=9) 0.8000000 0.4333333 0.7647059   0.9466667
## Accuracy3 Naive Bayes 0.7904762 0.4333333 0.7222222   0.9333333